by Kenny William Nyallau
The main objective of this analysis is to use regression-only models to predict the medical insurance costs incurred by hospital patients, using their electronic health records. We will first perform a typical exploratory data analysis, then data pre-processing, and finally model training and evaluation. Since the focus is on building, evaluating, and explaining the model, the exploratory data analysis will be kept brief.
The dataset that we will be using is provided by Brett Lantz via his github repository.
The hypothetical electronic health records attributes are the following:
age: age of primary beneficiary
sex: insurance contractor gender, female, male
bmi: Body mass index, providing an understanding of body, weights that are relatively high or low relative to height, objective index of body weight (kg / m ^ 2) using the ratio of height to weight, ideally 18.5 to 24.9
children: Number of children covered by health insurance / Number of dependents
smoker: Smoking
region: the beneficiary's residential area in the US, northeast, southeast, southwest, northwest.
charges: Individual medical costs billed by health insurance
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import pylab
import warnings
warnings.filterwarnings("ignore")
import scipy.stats as stats
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PolynomialFeatures, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
sns.set_style('whitegrid')
# Load the insurance EHR dataset (1338 rows, 7 columns) and preview the first rows.
data = pd.read_csv('data/insurance.csv')
data.head()
| age | sex | bmi | children | smoker | region | charges | |
|---|---|---|---|---|---|---|---|
| 0 | 19 | female | 27.900 | 0 | yes | southwest | 16884.92400 |
| 1 | 18 | male | 33.770 | 1 | no | southeast | 1725.55230 |
| 2 | 28 | male | 33.000 | 3 | no | southeast | 4449.46200 |
| 3 | 33 | male | 22.705 | 0 | no | northwest | 21984.47061 |
| 4 | 32 | male | 28.880 | 0 | no | northwest | 3866.85520 |
data.dtypes
age int64 sex object bmi float64 children int64 smoker object region object charges float64 dtype: object
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1338 entries, 0 to 1337 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 age 1338 non-null int64 1 sex 1338 non-null object 2 bmi 1338 non-null float64 3 children 1338 non-null int64 4 smoker 1338 non-null object 5 region 1338 non-null object 6 charges 1338 non-null float64 dtypes: float64(2), int64(2), object(3) memory usage: 73.3+ KB
# Work on a copy so the raw `data` frame stays untouched.
df = data.copy()
# Binary-encode the two two-level categoricals via explicit value maps:
# smoker: no -> 0, yes -> 1; sex: female -> 0, male -> 1.
df['smoker'] = df['smoker'].map({'no': 0, 'yes': 1})
df['sex'] = df['sex'].map({'female': 0, 'male': 1})
df.head()
| age | sex | bmi | children | smoker | region | charges | |
|---|---|---|---|---|---|---|---|
| 0 | 19 | 0 | 27.900 | 0 | 1 | southwest | 16884.92400 |
| 1 | 18 | 1 | 33.770 | 1 | 0 | southeast | 1725.55230 |
| 2 | 28 | 1 | 33.000 | 3 | 0 | southeast | 4449.46200 |
| 3 | 33 | 1 | 22.705 | 0 | 0 | northwest | 21984.47061 |
| 4 | 32 | 1 | 28.880 | 0 | 0 | northwest | 3866.85520 |
df.describe()
| age | sex | bmi | children | smoker | charges | |
|---|---|---|---|---|---|---|
| count | 1338.000000 | 1338.000000 | 1338.000000 | 1338.000000 | 1338.000000 | 1338.000000 |
| mean | 39.207025 | 0.505232 | 30.663397 | 1.094918 | 0.204783 | 13270.422265 |
| std | 14.049960 | 0.500160 | 6.098187 | 1.205493 | 0.403694 | 12110.011237 |
| min | 18.000000 | 0.000000 | 15.960000 | 0.000000 | 0.000000 | 1121.873900 |
| 25% | 27.000000 | 0.000000 | 26.296250 | 0.000000 | 0.000000 | 4740.287150 |
| 50% | 39.000000 | 1.000000 | 30.400000 | 1.000000 | 0.000000 | 9382.033000 |
| 75% | 51.000000 | 1.000000 | 34.693750 | 2.000000 | 0.000000 | 16639.912515 |
| max | 64.000000 | 1.000000 | 53.130000 | 5.000000 | 1.000000 | 63770.428010 |
# data.shape is (rows, columns); the 7 columns include the target `charges`,
# so the number of independent variables is shape[1] - 1 (bug fix: the original
# reported 7 "indepedent" variables — a typo and an off-by-one, since charges
# is the dependent variable).
f"We have {data.shape[0]} training examples and {data.shape[1] - 1} independent variables"
'We have 1338 training examples and 7 indepedent variables'
df.isnull().sum()
age 0 sex 0 bmi 0 children 0 smoker 0 region 0 charges 0 dtype: int64
The data is immaculate as we have no missing values.
# Integer-encode the 4-level `region` column (LabelEncoder assigns codes in
# alphabetical order of the category names).
# NOTE(review): LabelEncoder imposes an arbitrary ordinal relation on a nominal
# feature; the one-hot encoding done later with get_dummies is the safer choice.
label_encoder = LabelEncoder()
df['region']= label_encoder.fit_transform(df['region'])
df.head()
| age | sex | bmi | children | smoker | region | charges | |
|---|---|---|---|---|---|---|---|
| 0 | 19 | 0 | 27.900 | 0 | 1 | 3 | 16884.92400 |
| 1 | 18 | 1 | 33.770 | 1 | 0 | 2 | 1725.55230 |
| 2 | 28 | 1 | 33.000 | 3 | 0 | 2 | 4449.46200 |
| 3 | 33 | 1 | 22.705 | 0 | 0 | 1 | 21984.47061 |
| 4 | 32 | 1 | 28.880 | 0 | 0 | 1 | 3866.85520 |
# Correlation heatmap across all (now numeric) columns.
plt.figure(figsize=(14,10))
sns.heatmap(df.corr(), cmap='flare', annot=True)
plt.show()
# Pairwise scatter plots with regression fits, colored by sex.
# NOTE(review): pairplot creates its own figure, so the preceding plt.figure()
# stays empty — hence the stray "<Figure ... with 0 Axes>" output below.
plt.figure(figsize=(14,12))
sns.pairplot(df, kind="reg",hue="sex",aspect=2, palette='viridis')
plt.show()
<Figure size 1008x864 with 0 Axes>
# Same pairwise regression plots, this time colored by smoker status.
plt.figure(figsize=(14,12))
sns.pairplot(df, kind="reg",hue="smoker",aspect=2, palette='inferno')
plt.show()
<Figure size 1008x864 with 0 Axes>
# Mean charges per region, split by sex (bars show the mean with a CI).
plt.figure(figsize=(12,10))
sns.barplot(x='region', y='charges', hue='sex', data=df, palette='flare')
plt.show()
# Mean BMI per region, split by smoker status.
plt.figure(figsize=(12,10))
sns.barplot(x='region', y='bmi', hue='smoker', data=df, palette='Set1')
plt.show()
f"The minimum age in our dataset is {df['age'].min()} years old and the maximum is {df['age'].max()} years old."
'The minimum age in our dataset is 18 years old and the maximum is 64 years old.'
# Bucket ages into decade bands (10, 20] .. (60, 70] and attach them as a
# label column; every age in the data (18..64) falls into exactly one band.
decade_bands = [
    (10, 20, 'under_20'),
    (20, 30, 'under_30'),
    (30, 40, 'under_40'),
    (40, 50, 'under_50'),
    (50, 60, 'under_60'),
    (60, 70, 'under_70'),
]
conditions_age = [(df['age'] > lo) & (df['age'] <= hi) for lo, hi, _ in decade_bands]
values_age = [label for _, _, label in decade_bands]
df['age_category'] = np.select(conditions_age, values_age)
df.head()
| age | sex | bmi | children | smoker | region | charges | age_category | |
|---|---|---|---|---|---|---|---|---|
| 0 | 19 | 0 | 27.900 | 0 | 1 | 3 | 16884.92400 | under_20 |
| 1 | 18 | 1 | 33.770 | 1 | 0 | 2 | 1725.55230 | under_20 |
| 2 | 28 | 1 | 33.000 | 3 | 0 | 2 | 4449.46200 | under_30 |
| 3 | 33 | 1 | 22.705 | 0 | 0 | 1 | 21984.47061 | under_40 |
| 4 | 32 | 1 | 28.880 | 0 | 0 | 1 | 3866.85520 | under_40 |
# Mean BMI per age band, split by sex.
plt.figure(figsize=(12,10))
sns.barplot(x='age_category', y='bmi', hue='sex', data=df, palette='inferno')
plt.show()
# Mean charges per age band, split by sex.
plt.figure(figsize=(12,10))
sns.barplot(x='age_category', y='charges', hue='sex', data=df, palette='Set2')
plt.show()
As we can see, the charges for the patients increase linearly based on the patients' age. Also, the charges for male are generally higher than female patients.
# Summary statistics of the target variable, rounded to two decimals.
print(f" Mean of charges: {round(df['charges'].mean(),2)}")
print(f" Minimum of charges: {round(df['charges'].min(),2)}")
print(f" Maximum of charges: {round(df['charges'].max(),2)}")
Mean of charges: 13270.42 Minimum of charges: 1121.87 Maximum of charges: 63770.43
# Histogram + KDE of the raw target; it is visibly right-skewed.
# NOTE(review): sns.distplot is deprecated in newer seaborn; histplot/displot
# is the modern replacement — confirm the installed seaborn version.
plt.figure(figsize=(12,5))
sns.distplot(df['charges'], kde = True, color="purple")
plt.title('Charges distribution')
plt.show()
We can observe that the "charges" distribution is right-skewed. We can apply either a Box-Cox transformation or a log transformation in order to normalize the distribution.
# Box-Cox transform the (strictly positive) charges to reduce the right skew;
# stats.boxcox also returns the fitted lambda parameter.
plt.figure(figsize=(12,5))
bc_charges, lambda_param = stats.boxcox(df['charges'])
sns.distplot(bc_charges, color="brown")
plt.show()
print(f"The lambda parameter for Box Cox transformation: {lambda_param}")
The lambda parameter for Box Cox transformation: 0.043649053770664956
# Q-Q plot of the Box-Cox transformed charges against the normal distribution.
# Bug fix: the original referenced an undefined name `fitted_charges` (NameError
# at runtime); the Box-Cox output computed above is stored in `bc_charges`.
plt.figure(figsize=(12,5))
stats.probplot(bc_charges, dist="norm", plot=pylab)
pylab.show()
# Log-transform alternative: distribution and Q-Q plot of log(charges).
plt.figure(figsize=(12,5))
sns.distplot(np.log(df['charges']))
plt.show()
plt.figure(figsize=(12,5))
stats.probplot(np.log(df['charges']), dist="norm", plot=pylab)
pylab.show()
# Age distribution of the beneficiaries.
plt.figure(figsize=(12,5))
sns.distplot(df['age'], kde = True, color="green")
plt.show()
# Box plot of charges by sex.
# NOTE(review): catplot creates its own figure, so the plt.figure() here stays
# empty — hence the stray "<Figure ... with 0 Axes>" output below.
plt.figure(figsize=(10,10))
sns.catplot(x="sex", y="charges", data=df, kind="box", aspect=2, palette='viridis')
plt.show()
<Figure size 720x720 with 0 Axes>
# Mean charges per (label-encoded) region.
plt.figure(figsize=(12,5))
sns.catplot(x="region", y="charges", kind="bar", data=df, aspect=2, palette='flare')
plt.show()
<Figure size 864x360 with 0 Axes>
# Mean charges per region, split by smoker status.
_, ax = plt.subplots(1,1, figsize=(12,8))
ax = sns.barplot(x = 'region', y = 'charges', hue='smoker', data=df, palette='Reds_r')
plt.show()
# Assign each patient a WHO-style BMI band and attach it as a label column.
# NOTE(review): the (25, 35] band labelled 'obese_i' also covers the usual
# "overweight" range (25-30); labels kept as-is to preserve behavior.
bmi_bands = [
    (-np.inf, 16, 'severe_thinness'),
    (16, 17, 'moderate_thinness'),
    (17, 18.5, 'mild_thinness'),
    (18.5, 25, 'normal'),
    (25, 35, 'obese_i'),
    (35, 40, 'obese_ii'),
    (40, np.inf, 'obese_iii'),
]
conditions = [(df['bmi'] > lo) & (df['bmi'] <= hi) for lo, hi, _ in bmi_bands]
values = [label for _, _, label in bmi_bands]
df['bmi_category'] = np.select(conditions, values)
df.head()
| age | sex | bmi | children | smoker | region | charges | age_category | bmi_category | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 19 | 0 | 27.900 | 0 | 1 | 3 | 16884.92400 | under_20 | obese_i |
| 1 | 18 | 1 | 33.770 | 1 | 0 | 2 | 1725.55230 | under_20 | obese_i |
| 2 | 28 | 1 | 33.000 | 3 | 0 | 2 | 4449.46200 | under_30 | obese_i |
| 3 | 33 | 1 | 22.705 | 0 | 0 | 1 | 21984.47061 | under_40 | normal |
| 4 | 32 | 1 | 28.880 | 0 | 0 | 1 | 3866.85520 | under_40 | obese_i |
# Mean charges per BMI band.
sns.catplot(x="bmi_category", y="charges", kind="bar", data=df, aspect=2, palette='inferno')
plt.title('Charges based on BMI categories')
plt.show()
# Box plot of BMI within each BMI band, split by sex.
# Bug fix: the title said 'Charges vs children' (copy-pasted from the next
# cell), but this plot actually shows BMI by BMI category.
plt.figure(figsize=(14,6))
sns.boxplot(x='bmi_category', y='bmi',hue='sex', data=df,palette='viridis')
plt.title('BMI by BMI category and sex')
plt.show()
# Box plot of charges by number of children covered, split by sex.
plt.figure(figsize=(14,6))
sns.boxplot(x='children', y='charges', hue='sex', data=df, palette='viridis')
plt.title('Charges vs children')
plt.show()
# Violin plot of the same relationship: charges by children count, split by sex.
plt.figure(figsize=(14,6))
ax = sns.violinplot(x = 'children', y = 'charges', data=df, orient='v', hue='sex', palette='inferno')
plt.title("Charges based on children's gender")
plt.show()
# Design matrix: the six encoded patient attributes.
feats_all = ['age', 'sex', 'bmi', 'children', 'smoker', 'region']
X = pd.DataFrame(df, columns=feats_all)
# Target: individual medical costs billed by the insurer.
y = pd.DataFrame(df, columns=['charges'])

# Hold out 30% of the rows for testing; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Standardize the features, then fit a Lasso regressor; the regularization
# strength alpha is tuned over 0.1 .. 9.9 (step 0.1) with 5-fold CV on
# (negated) mean squared error.
pipeline = Pipeline([('scaler', StandardScaler()), ('model', Lasso())])
param_grid = {'model__alpha': np.arange(0.1, 10, 0.1)}
search = GridSearchCV(pipeline, param_grid, cv=5,
                      scoring="neg_mean_squared_error", verbose=3)
search.fit(X_train, y_train)
] END ...............................model__alpha=5.0; total time= 0.0s [CV 5/5] END ...............................model__alpha=5.0; total time= 0.0s [CV 1/5] END ...............................model__alpha=5.1; total time= 0.0s [CV 2/5] END ...............................model__alpha=5.1; total time= 0.0s [CV 3/5] END ...............................model__alpha=5.1; total time= 0.0s [CV 4/5] END ...............................model__alpha=5.1; total time= 0.0s [CV 5/5] END ...............................model__alpha=5.1; total time= 0.0s [CV 1/5] END ...............................model__alpha=5.2; total time= 0.0s [CV 2/5] END ...............................model__alpha=5.2; total time= 0.0s [CV 3/5] END ...............................model__alpha=5.2; total time= 0.0s [CV 4/5] END ...............................model__alpha=5.2; total time= 0.0s [CV 5/5] END ...............................model__alpha=5.2; total time= 0.0s [CV 1/5] END ...............................model__alpha=5.3; total time= 0.0s [CV 2/5] END ...............................model__alpha=5.3; total time= 0.0s [CV 3/5] END ...............................model__alpha=5.3; total time= 0.0s [CV 4/5] END ...............................model__alpha=5.3; total time= 0.0s [CV 5/5] END ...............................model__alpha=5.3; total time= 0.0s [CV 1/5] END ...............................model__alpha=5.4; total time= 0.0s [CV 2/5] END ...............................model__alpha=5.4; total time= 0.0s [CV 3/5] END ...............................model__alpha=5.4; total time= 0.0s [CV 4/5] END ...............................model__alpha=5.4; total time= 0.0s [CV 5/5] END ...............................model__alpha=5.4; total time= 0.0s [CV 1/5] END ...............................model__alpha=5.5; total time= 0.0s [CV 2/5] END ...............................model__alpha=5.5; total time= 0.0s [CV 3/5] END ...............................model__alpha=5.5; total time= 0.0s [CV 4/5] END 
...............................model__alpha=5.5; total time= 0.0s [CV 5/5] END ...............................model__alpha=5.5; total time= 0.0s [CV 1/5] END ...............................model__alpha=5.6; total time= 0.0s [CV 2/5] END ...............................model__alpha=5.6; total time= 0.0s [CV 3/5] END ...............................model__alpha=5.6; total time= 0.0s [CV 4/5] END ...............................model__alpha=5.6; total time= 0.0s [CV 5/5] END ...............................model__alpha=5.6; total time= 0.0s [CV 1/5] END ...............................model__alpha=5.7; total time= 0.0s [CV 2/5] END ...............................model__alpha=5.7; total time= 0.0s [CV 3/5] END ...............................model__alpha=5.7; total time= 0.0s [CV 4/5] END ...............................model__alpha=5.7; total time= 0.0s [CV 5/5] END ...............................model__alpha=5.7; total time= 0.0s [CV 1/5] END ...............................model__alpha=5.8; total time= 0.0s [CV 2/5] END ...............................model__alpha=5.8; total time= 0.0s [CV 3/5] END ...............................model__alpha=5.8; total time= 0.0s [CV 4/5] END ...............................model__alpha=5.8; total time= 0.0s [CV 5/5] END ...............................model__alpha=5.8; total time= 0.0s [CV 1/5] END ...............................model__alpha=5.9; total time= 0.0s [CV 2/5] END ...............................model__alpha=5.9; total time= 0.0s [CV 3/5] END ...............................model__alpha=5.9; total time= 0.0s [CV 4/5] END ...............................model__alpha=5.9; total time= 0.0s [CV 5/5] END ...............................model__alpha=5.9; total time= 0.0s [CV 1/5] END ...............................model__alpha=6.0; total time= 0.0s [CV 2/5] END ...............................model__alpha=6.0; total time= 0.0s [CV 3/5] END ...............................model__alpha=6.0; total time= 0.0s [CV 4/5] END 
...............................model__alpha=6.0; total time= 0.0s [CV 5/5] END ...............................model__alpha=6.0; total time= 0.0s [CV 1/5] END ...............................model__alpha=6.1; total time= 0.0s [CV 2/5] END ...............................model__alpha=6.1; total time= 0.0s [CV 3/5] END ...............................model__alpha=6.1; total time= 0.0s [CV 4/5] END ...............................model__alpha=6.1; total time= 0.0s [CV 5/5] END ...............................model__alpha=6.1; total time= 0.0s [CV 1/5] END ...............................model__alpha=6.2; total time= 0.0s [CV 2/5] END ...............................model__alpha=6.2; total time= 0.0s [CV 3/5] END ...............................model__alpha=6.2; total time= 0.0s [CV 4/5] END ...............................model__alpha=6.2; total time= 0.0s [CV 5/5] END ...............................model__alpha=6.2; total time= 0.0s [CV 1/5] END ...............................model__alpha=6.3; total time= 0.0s [CV 2/5] END ...............................model__alpha=6.3; total time= 0.0s [CV 3/5] END ...............................model__alpha=6.3; total time= 0.0s [CV 4/5] END ...............................model__alpha=6.3; total time= 0.0s [CV 5/5] END ...............................model__alpha=6.3; total time= 0.0s [CV 1/5] END ...............................model__alpha=6.4; total time= 0.0s [CV 2/5] END ...............................model__alpha=6.4; total time= 0.0s [CV 3/5] END ...............................model__alpha=6.4; total time= 0.0s [CV 4/5] END ...............................model__alpha=6.4; total time= 0.0s [CV 5/5] END ...............................model__alpha=6.4; total time= 0.0s [CV 1/5] END ...............................model__alpha=6.5; total time= 0.0s [CV 2/5] END ...............................model__alpha=6.5; total time= 0.0s [CV 3/5] END ...............................model__alpha=6.5; total time= 0.0s [CV 4/5] END 
...............................model__alpha=6.5; total time= 0.0s [CV 5/5] END ...............................model__alpha=6.5; total time= 0.0s [CV 1/5] END ...............................model__alpha=6.6; total time= 0.0s [CV 2/5] END ...............................model__alpha=6.6; total time= 0.0s [CV 3/5] END ...............................model__alpha=6.6; total time= 0.0s [CV 4/5] END ...............................model__alpha=6.6; total time= 0.0s [CV 5/5] END ...............................model__alpha=6.6; total time= 0.0s [CV 1/5] END ...............................model__alpha=6.7; total time= 0.0s [CV 2/5] END ...............................model__alpha=6.7; total time= 0.0s [CV 3/5] END ...............................model__alpha=6.7; total time= 0.0s [CV 4/5] END ...............................model__alpha=6.7; total time= 0.0s [CV 5/5] END ...............................model__alpha=6.7; total time= 0.0s [CV 1/5] END ...............................model__alpha=6.8; total time= 0.0s [CV 2/5] END ...............................model__alpha=6.8; total time= 0.0s [CV 3/5] END ...............................model__alpha=6.8; total time= 0.0s [CV 4/5] END ...............................model__alpha=6.8; total time= 0.0s [CV 5/5] END ...............................model__alpha=6.8; total time= 0.0s [CV 1/5] END ...............................model__alpha=6.9; total time= 0.0s [CV 2/5] END ...............................model__alpha=6.9; total time= 0.0s [CV 3/5] END ...............................model__alpha=6.9; total time= 0.0s [CV 4/5] END ...............................model__alpha=6.9; total time= 0.0s [CV 5/5] END ...............................model__alpha=6.9; total time= 0.0s [CV 1/5] END ...............................model__alpha=7.0; total time= 0.0s [CV 2/5] END ...............................model__alpha=7.0; total time= 0.0s [CV 3/5] END ...............................model__alpha=7.0; total time= 0.0s [CV 4/5] END 
...............................model__alpha=7.0; total time= 0.0s [CV 5/5] END ...............................model__alpha=7.0; total time= 0.0s [CV 1/5] END ...............................model__alpha=7.1; total time= 0.0s [CV 2/5] END ...............................model__alpha=7.1; total time= 0.0s [CV 3/5] END ...............................model__alpha=7.1; total time= 0.0s [CV 4/5] END ...............................model__alpha=7.1; total time= 0.0s [CV 5/5] END ...............................model__alpha=7.1; total time= 0.0s [CV 1/5] END ...............................model__alpha=7.2; total time= 0.0s [CV 2/5] END ...............................model__alpha=7.2; total time= 0.0s [CV 3/5] END ...............................model__alpha=7.2; total time= 0.0s [CV 4/5] END ...............................model__alpha=7.2; total time= 0.0s [CV 5/5] END ...............................model__alpha=7.2; total time= 0.0s [CV 1/5] END ...............................model__alpha=7.3; total time= 0.0s [CV 2/5] END ...............................model__alpha=7.3; total time= 0.0s [CV 3/5] END ...............................model__alpha=7.3; total time= 0.0s [CV 4/5] END ...............................model__alpha=7.3; total time= 0.0s [CV 5/5] END ...............................model__alpha=7.3; total time= 0.0s [CV 1/5] END ...............................model__alpha=7.4; total time= 0.0s [CV 2/5] END ...............................model__alpha=7.4; total time= 0.0s [CV 3/5] END ...............................model__alpha=7.4; total time= 0.0s [CV 4/5] END ...............................model__alpha=7.4; total time= 0.0s [CV 5/5] END ...............................model__alpha=7.4; total time= 0.0s [CV 1/5] END ...............................model__alpha=7.5; total time= 0.0s [CV 2/5] END ...............................model__alpha=7.5; total time= 0.0s [CV 3/5] END ...............................model__alpha=7.5; total time= 0.0s [CV 4/5] END 
...............................model__alpha=7.5; total time= 0.0s [CV 5/5] END ...............................model__alpha=7.5; total time= 0.0s [CV 1/5] END ...............................model__alpha=7.6; total time= 0.0s [CV 2/5] END ...............................model__alpha=7.6; total time= 0.0s [CV 3/5] END ...............................model__alpha=7.6; total time= 0.0s [CV 4/5] END ...............................model__alpha=7.6; total time= 0.0s [CV 5/5] END ...............................model__alpha=7.6; total time= 0.0s [CV 1/5] END ...............................model__alpha=7.7; total time= 0.0s [CV 2/5] END ...............................model__alpha=7.7; total time= 0.0s [CV 3/5] END ...............................model__alpha=7.7; total time= 0.0s [CV 4/5] END ...............................model__alpha=7.7; total time= 0.0s [CV 5/5] END ...............................model__alpha=7.7; total time= 0.0s [CV 1/5] END ...............................model__alpha=7.8; total time= 0.0s [CV 2/5] END ...............................model__alpha=7.8; total time= 0.0s [CV 3/5] END ...............................model__alpha=7.8; total time= 0.0s [CV 4/5] END ...............................model__alpha=7.8; total time= 0.0s [CV 5/5] END ...............................model__alpha=7.8; total time= 0.0s [CV 1/5] END ...............................model__alpha=7.9; total time= 0.0s [CV 2/5] END ...............................model__alpha=7.9; total time= 0.0s [CV 3/5] END ...............................model__alpha=7.9; total time= 0.0s [CV 4/5] END ...............................model__alpha=7.9; total time= 0.0s [CV 5/5] END ...............................model__alpha=7.9; total time= 0.0s [CV 1/5] END ...............................model__alpha=8.0; total time= 0.0s [CV 2/5] END ...............................model__alpha=8.0; total time= 0.0s [CV 3/5] END ...............................model__alpha=8.0; total time= 0.0s [CV 4/5] END 
...............................model__alpha=8.0; total time= 0.0s [CV 5/5] END ...............................model__alpha=8.0; total time= 0.0s [CV 1/5] END ...............................model__alpha=8.1; total time= 0.0s [CV 2/5] END ...............................model__alpha=8.1; total time= 0.0s [CV 3/5] END ...............................model__alpha=8.1; total time= 0.0s [CV 4/5] END ...............................model__alpha=8.1; total time= 0.0s [CV 5/5] END ...............................model__alpha=8.1; total time= 0.0s [CV 1/5] END ...............................model__alpha=8.2; total time= 0.0s [CV 2/5] END ...............................model__alpha=8.2; total time= 0.0s [CV 3/5] END ...............................model__alpha=8.2; total time= 0.0s [CV 4/5] END ...............................model__alpha=8.2; total time= 0.0s [CV 5/5] END ...............................model__alpha=8.2; total time= 0.0s [CV 1/5] END ...............................model__alpha=8.3; total time= 0.0s [CV 2/5] END ...............................model__alpha=8.3; total time= 0.0s [CV 3/5] END ...............................model__alpha=8.3; total time= 0.0s [CV 4/5] END ...............................model__alpha=8.3; total time= 0.0s [CV 5/5] END ...............................model__alpha=8.3; total time= 0.0s [CV 1/5] END ...............................model__alpha=8.4; total time= 0.0s [CV 2/5] END ...............................model__alpha=8.4; total time= 0.0s [CV 3/5] END ...............................model__alpha=8.4; total time= 0.0s [CV 4/5] END ...............................model__alpha=8.4; total time= 0.0s [CV 5/5] END ...............................model__alpha=8.4; total time= 0.0s [CV 1/5] END ...............................model__alpha=8.5; total time= 0.0s [CV 2/5] END ...............................model__alpha=8.5; total time= 0.0s [CV 3/5] END ...............................model__alpha=8.5; total time= 0.0s [CV 4/5] END 
...............................model__alpha=8.5; total time= 0.0s [CV 5/5] END ...............................model__alpha=8.5; total time= 0.0s [CV 1/5] END ...............................model__alpha=8.6; total time= 0.0s [CV 2/5] END ...............................model__alpha=8.6; total time= 0.0s [CV 3/5] END ...............................model__alpha=8.6; total time= 0.0s [CV 4/5] END ...............................model__alpha=8.6; total time= 0.0s [CV 5/5] END ...............................model__alpha=8.6; total time= 0.0s [CV 1/5] END ...............................model__alpha=8.7; total time= 0.0s [CV 2/5] END ...............................model__alpha=8.7; total time= 0.0s [CV 3/5] END ...............................model__alpha=8.7; total time= 0.0s [CV 4/5] END ...............................model__alpha=8.7; total time= 0.0s [CV 5/5] END ...............................model__alpha=8.7; total time= 0.0s [CV 1/5] END ...............................model__alpha=8.8; total time= 0.0s [CV 2/5] END ...............................model__alpha=8.8; total time= 0.0s [CV 3/5] END ...............................model__alpha=8.8; total time= 0.0s [CV 4/5] END ...............................model__alpha=8.8; total time= 0.0s [CV 5/5] END ...............................model__alpha=8.8; total time= 0.0s [CV 1/5] END ...............................model__alpha=8.9; total time= 0.0s [CV 2/5] END ...............................model__alpha=8.9; total time= 0.0s [CV 3/5] END ...............................model__alpha=8.9; total time= 0.0s [CV 4/5] END ...............................model__alpha=8.9; total time= 0.0s [CV 5/5] END ...............................model__alpha=8.9; total time= 0.0s [CV 1/5] END ...............................model__alpha=9.0; total time= 0.0s [CV 2/5] END ...............................model__alpha=9.0; total time= 0.0s [CV 3/5] END ...............................model__alpha=9.0; total time= 0.0s [CV 4/5] END 
...............................model__alpha=9.0; total time= 0.0s [CV 5/5] END ...............................model__alpha=9.0; total time= 0.0s [CV 1/5] END ...............................model__alpha=9.1; total time= 0.0s [CV 2/5] END ...............................model__alpha=9.1; total time= 0.0s [CV 3/5] END ...............................model__alpha=9.1; total time= 0.0s [CV 4/5] END ...............................model__alpha=9.1; total time= 0.0s [CV 5/5] END ...............................model__alpha=9.1; total time= 0.0s [CV 1/5] END ...............................model__alpha=9.2; total time= 0.0s [CV 2/5] END ...............................model__alpha=9.2; total time= 0.0s [CV 3/5] END ...............................model__alpha=9.2; total time= 0.0s [CV 4/5] END ...............................model__alpha=9.2; total time= 0.0s [CV 5/5] END ...............................model__alpha=9.2; total time= 0.0s [CV 1/5] END ...............................model__alpha=9.3; total time= 0.0s [CV 2/5] END ...............................model__alpha=9.3; total time= 0.0s [CV 3/5] END ...............................model__alpha=9.3; total time= 0.0s [CV 4/5] END ...............................model__alpha=9.3; total time= 0.0s [CV 5/5] END ...............................model__alpha=9.3; total time= 0.0s [CV 1/5] END ...............................model__alpha=9.4; total time= 0.0s [CV 2/5] END ...............................model__alpha=9.4; total time= 0.0s [CV 3/5] END ...............................model__alpha=9.4; total time= 0.0s [CV 4/5] END ...............................model__alpha=9.4; total time= 0.0s [CV 5/5] END ...............................model__alpha=9.4; total time= 0.0s [CV 1/5] END ...............................model__alpha=9.5; total time= 0.0s [CV 2/5] END ...............................model__alpha=9.5; total time= 0.0s [CV 3/5] END ...............................model__alpha=9.5; total time= 0.0s [CV 4/5] END 
...............................model__alpha=9.5; total time= 0.0s [CV 5/5] END ...............................model__alpha=9.5; total time= 0.0s [CV 1/5] END ...............................model__alpha=9.6; total time= 0.0s [CV 2/5] END ...............................model__alpha=9.6; total time= 0.0s [CV 3/5] END ...............................model__alpha=9.6; total time= 0.0s [CV 4/5] END ...............................model__alpha=9.6; total time= 0.0s [CV 5/5] END ...............................model__alpha=9.6; total time= 0.0s [CV 1/5] END .................model__alpha=9.700000000000001; total time= 0.0s [CV 2/5] END .................model__alpha=9.700000000000001; total time= 0.0s [CV 3/5] END .................model__alpha=9.700000000000001; total time= 0.0s [CV 4/5] END .................model__alpha=9.700000000000001; total time= 0.0s [CV 5/5] END .................model__alpha=9.700000000000001; total time= 0.0s [CV 1/5] END ...............................model__alpha=9.8; total time= 0.0s [CV 2/5] END ...............................model__alpha=9.8; total time= 0.0s [CV 3/5] END ...............................model__alpha=9.8; total time= 0.0s [CV 4/5] END ...............................model__alpha=9.8; total time= 0.0s [CV 5/5] END ...............................model__alpha=9.8; total time= 0.0s [CV 1/5] END ...............................model__alpha=9.9; total time= 0.0s [CV 2/5] END ...............................model__alpha=9.9; total time= 0.0s [CV 3/5] END ...............................model__alpha=9.9; total time= 0.0s [CV 4/5] END ...............................model__alpha=9.9; total time= 0.0s [CV 5/5] END ...............................model__alpha=9.9; total time= 0.0s
GridSearchCV(cv=5,
estimator=Pipeline(steps=[('scaler', StandardScaler()),
('model', Lasso())]),
param_grid={'model__alpha': array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. , 1.1, 1.2, 1.3,
1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2. , 2.1, 2.2, 2.3, 2.4, 2.5, 2.6,
2.7, 2.8, 2.9, 3. , 3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9,
4. , 4.1, 4.2, 4.3, 4.4, 4.5, 4.6, 4.7, 4.8, 4.9, 5. , 5.1, 5.2,
5.3, 5.4, 5.5, 5.6, 5.7, 5.8, 5.9, 6. , 6.1, 6.2, 6.3, 6.4, 6.5,
6.6, 6.7, 6.8, 6.9, 7. , 7.1, 7.2, 7.3, 7.4, 7.5, 7.6, 7.7, 7.8,
7.9, 8. , 8.1, 8.2, 8.3, 8.4, 8.5, 8.6, 8.7, 8.8, 8.9, 9. , 9.1,
9.2, 9.3, 9.4, 9.5, 9.6, 9.7, 9.8, 9.9])},
scoring='neg_mean_squared_error', verbose=3)
# Magnitudes of the Lasso coefficients from the best CV model; because the
# inputs were standardized, they are comparable across features
# (larger = more influential; exactly zero = feature discarded by the L1 penalty).
coefficients = search.best_estimator_.named_steps['model'].coef_
importance = np.abs(coefficients)
importance
array([3684.42039571, 45.86470378, 2053.73121995, 505.63802738,
9583.07263043, 351.93449591])
As we can see from the feature importance result, all of the six features were presented as important for training and none were discarded.
As a general rule of thumb, we need to ensure that our regression model does not suffer from multicollinearity, i.e. that the independent variables are not highly correlated with each other. The most common way to detect multicollinearity is the Variance Inflation Factor (VIF). We will call statsmodels' variance_inflation_factor function and check every feature for multicollinearity.
from statsmodels.stats.outliers_influence import variance_inflation_factor
def calculate_vif(X):
    """Print and return the variance inflation factor of every column in X.

    Parameters
    ----------
    X : pd.DataFrame
        Numeric feature matrix; each column gets one VIF value.

    Returns
    -------
    pd.DataFrame
        Two columns, "feature" and "VIF". A VIF above ~5 indicates the
        feature is highly collinear with the others.
    """
    vif_data = pd.DataFrame()
    vif_data["feature"] = X.columns
    # statsmodels computes VIF_i by regressing column i on all other columns.
    vif_data["VIF"] = [variance_inflation_factor(X.values, i) for i in range(len(X.columns))]
    print(vif_data)
    # Returning the frame (instead of only printing) lets callers inspect or
    # filter the result programmatically; existing call sites ignore it.
    return vif_data
calculate_vif(X)
feature VIF 0 age 7.551348 1 sex 2.001061 2 bmi 10.371829 3 children 1.801245 4 smoker 1.256837 5 region 2.924528
We can observe that there are two features (age and bmi) that have a VIF higher than 5. That means these variables have high multicollinearity. There are a few ways to deal with multicollinearity, and one of them is to remove one of the features from our data. So let's try to remove the feature "age" and see if the high multicollinearity still exists.
# Re-check multicollinearity after dropping 'age' from the feature set.
reduced_feats = ['sex', 'bmi', 'children', 'smoker', 'region']
X = pd.DataFrame(df, columns=reduced_feats)
calculate_vif(X)
feature VIF 0 sex 1.999080 1 bmi 4.645412 2 children 1.783128 3 smoker 1.256722 4 region 2.922216
That looks good, but dropping what seemed to be an important feature reduces both the model's predictive performance and its interpretability. To avoid dropping features, we can use one-hot encoding to fix this instead.
# One-hot encode the categorical columns, dropping the first level of each
# so the encoded columns are not perfectly collinear (dummy-variable trap).
cat_columns = ['sex', 'children', 'smoker', 'region']
df_dummy = pd.get_dummies(
    data.copy(),
    prefix='dummy',
    prefix_sep='_',
    columns=cat_columns,
    drop_first=True,
    dtype='int8',
)
df_dummy.head()
| age | bmi | charges | dummy_male | dummy_1 | dummy_2 | dummy_3 | dummy_4 | dummy_5 | dummy_yes | dummy_northwest | dummy_southeast | dummy_southwest | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 19 | 27.900 | 16884.92400 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 |
| 1 | 18 | 33.770 | 1725.55230 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
| 2 | 28 | 33.000 | 4449.46200 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
| 3 | 33 | 22.705 | 21984.47061 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
| 4 | 32 | 28.880 | 3866.85520 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
df_dummy.shape
(1338, 13)
calculate_vif(X)
feature VIF 0 sex 1.999080 1 bmi 4.645412 2 children 1.783128 3 smoker 1.256722 4 region 2.922216
As you can see, we have effectively removed the multicollinearity without removing any features from our dataset.
As mentioned above, we are going to perform the prediction task with regression models rather than other approaches such as clustering or decision trees. In this case, we will start with the base model, linear regression, followed by Ridge regression, Lasso regression and finally polynomial regression. At the same time, we will compare each of their performances using metrics such as R-squared and mean squared error.
# Fix the random state so every train/test split below is reproducible and
# comparable across models. (Commented-out experiments removed.)
random_state = 42

# Predictors: every dummy-encoded column except the target.
X = df_dummy.drop('charges', axis=1)
# Target: log of charges, which normalizes its strongly right-skewed
# distribution before fitting linear models.
y = np.log(df_dummy.charges)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=random_state)

# Standardize features, fitting the scaler on the training data only so no
# test-set statistics leak into training.
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)
Linear regression is a supervised learning algorithm that describes the relationship between some number of independent variables X and a dependent variable Y in order to predict a continuous numerical outcome using a best-fit straight line. The formula for the linear model is:
A simple linear regression can be described by the following formula:
$y = {\beta} _{0} + {\beta}_{1}{x}$
Whereby:
$y$ is the target output
$x$ is the feature
${\beta} _{0}$ is the intercept
${\beta} _{1}$ is the coefficient of $x$
When we are dealing with multiple features then we can simply use multiple regression which is just another extension of linear regression:
$y = {\beta} _{0} + {\beta}_{1}{x}_{1} + {...} + {\beta}_{n}{x}_{n}$
where ${n}$ is the number of features
# Baseline: ordinary least-squares fit on the scaled training data
# (fit returns the estimator itself, so the call can be chained).
lr = LinearRegression().fit(X_train, y_train)
LinearRegression()
# Report the fitted intercept and per-feature coefficients of the baseline.
print(f"Intercept: {lr.intercept_}")
print(f"Coefficient: {lr.coef_}")
Intercept: 9.109006822397197 Coefficient: [ 0.48800986 0.08223213 -0.04014345 0.06244393 0.10109127 0.06827412 0.07820317 0.04668392 0.63256921 -0.03055681 -0.07201095 -0.05408895]
# Predict once on the held-out set; lr_pred serves both the plot below and
# the metrics computed afterwards. The original y_train_pred/y_test_pred
# assignments were redundant here (never read before being reassigned in
# the polynomial section) and have been removed.
lr_pred = lr.predict(X_test)

# Predicted vs. actual log-charges for the baseline model.
plt.figure(figsize=(12,6))
sns.regplot(x=lr_pred, y=y_test, color='black')
plt.xlabel("Predicted charges")
plt.show()
For regression tasks, we will typically use the following metrics:
Mean Absolute Error (MAE):
$\frac{1}{n} \sum_{i = 1}^{n}{|{y}_{i} - \hat{y}_{i}|}$
Mean Squared Error (MSE):
$\frac{1}{n} \sum_{i = 1}^{n}{({y}_{i} - \hat{y}_{i})^2}$
Root Mean Squared Error (RMSE):
$\sqrt{\frac{1}{n} \sum_{i = 1}^{n}{({y}_{i} - \hat{y}_{i})^2}}$
We will apply those evaluation metrics to the rest of the regression models.
# Test-set metrics for the baseline linear regression.
mae_lr = mean_absolute_error(y_test, lr_pred)
mse_lr = mean_squared_error(y_test, lr_pred)
rmse_lr = np.sqrt(mse_lr)  # derive RMSE from the MSE instead of recomputing it
r_square_lr = r2_score(y_test, lr_pred)
print([mae_lr, mse_lr, rmse_lr, r_square_lr])
[0.27231892500548666, 0.18051229758893123, 0.42486738823888476, 0.7844372891704955]
Ridge regression is a model tuning method used to analyse data that suffers from high multicollinearity in multiple regression.
# L2-regularized linear model; alpha=0.5 sets the penalty strength.
ridge = Ridge(alpha=0.5).fit(X_train, y_train)
Ridge(alpha=0.5)
# Held-out predictions, fitted parameters and R^2 for the ridge model
# (same print order as before: intercept, coefficients, R^2).
ridge_pred = ridge.predict(X_test)
ridge_r2 = r2_score(y_test, ridge_pred)
print(ridge.intercept_)
print(ridge.coef_)
print(ridge_r2)
9.109006822397197 [ 0.48773964 0.08219441 -0.04010253 0.06237461 0.10102103 0.06825144 0.07813278 0.04662345 0.63221263 -0.03048915 -0.07190198 -0.05400146] 0.784436067338875
# Error metrics for ridge on the test set.
mae_ridge = mean_absolute_error(y_test, ridge_pred)
mse_ridge = mean_squared_error(y_test, ridge_pred)
rmse_ridge = np.sqrt(mse_ridge)  # reuse the MSE already computed above
print([mae_ridge, mse_ridge, rmse_ridge])
# Predicted vs. actual log-charges for the ridge model.
plt.figure(figsize=(12, 6))
ax = sns.regplot(x=ridge_pred, y=y_test, color='red')
ax.set_xlabel("Predicted charges")
plt.show()
Lasso, or Least Absolute Shrinkage and Selection Operator, serves as a method for feature selection and model regularization. It regularizes model parameters by shrinking the coefficients, reducing some of them to zero.
# L1-regularized (Lasso) regression; the penalty can shrink coefficients
# exactly to zero, performing feature selection. Only alpha is set: every
# other argument originally listed was a scikit-learn default, and the
# deprecated `normalize` flag (removed in scikit-learn 1.2) would crash on
# modern versions, so it is dropped.
lass = Lasso(alpha=0.2)
lass.fit(X_train, y_train)
print(lass.intercept_)
print(lass.coef_)
9.109006822397197 [ 0.29404127 0. -0. 0. 0. 0. 0. 0. 0.42117713 -0. -0. -0. ]
lass_pred = lass.predict(X_test)
# Pass y_true first, y_pred second — MSE is symmetric so the value is
# unchanged, but this matches the argument order used for the other models.
lass_mse = mean_squared_error(y_test, lass_pred)
lass_rsquare = r2_score(y_test, lass_pred)
print(f"Mean squared error: {lass_mse}")
print(f"R-square: {lass_rsquare}")
Mean squared error: 0.29041816520526403 R-square: 0.6531907919739627
# Predicted vs. actual log-charges for the lasso model.
plt.figure(figsize=(12, 6))
ax = sns.regplot(x=lass_pred, y=y_test, color='blue')
ax.set_xlabel("Predicted charges")
plt.show()
If the relationship between the variables is not linear but the data is correlated, then linear regression might not be the best solution. In this case, polynomial regression can be used to achieve a minimum error or minimum cost function, as well as being able to fit a polynomial curve. However, it may be sensitive to outliers.
# Expand the features with all degree-2 terms and interactions, then fit a
# plain linear model on the expanded design matrix.
poly = PolynomialFeatures(degree=2)
X_poly = poly.fit_transform(X)
# NOTE(review): test_size here is the default 0.25, unlike the 0.3 used for
# the earlier models — confirm whether the splits were meant to match.
X_train, X_test, y_train, y_test = train_test_split(
    X_poly, y, random_state=random_state)
poly_lr = LinearRegression().fit(X_train, y_train)
# A single held-out prediction serves both plotting and metrics (the extra
# y_train_pred/y_test_pred duplicates were unused and have been removed).
poly_lr_pred = poly_lr.predict(X_test)
# mean_squared_error is already imported in the notebook header, so the
# mid-file re-import is dropped; y_true is passed first for consistency
# with the other metric calls (MSE is symmetric, so the value is unchanged).
poly_lr_mse = mean_squared_error(y_test, poly_lr_pred)
poly_lr_rsquare = poly_lr.score(X_test, y_test)
print(f"Mean squared error: {poly_lr_mse}")
print(f"R-square: {poly_lr_rsquare}")
Mean squared error: 0.14409268805977918 R-square: 0.8332268249646304
# Predicted vs. actual log-charges for the polynomial model.
plt.figure(figsize=(12, 6))
ax = sns.regplot(x=poly_lr_pred, y=y_test, color='green')
ax.set_xlabel("Predicted charges")
plt.show()